rm(list=ls())
setwd('.')
library(plyr); library(dplyr)
library(ggplot2); library(reshape2); library(foreign)
library(multiwayvcov); library(lmtest); library(plm)
library(readxl); library(lubridate)
library(haven); library(stringr)
library(car)
library(lme4)

######CMP Data#####
#Load the data.
CMP <- read_dta('data/MPDataset_MPDS2016b.dta', encoding = 'UTF-8')
#Select Portugal
CMP <- subset(CMP, countryname == "Portugal")
#Select Relevant Years
CMP$year <- as.numeric(substring(CMP$date, 1, 4))
CMP <- subset(CMP, year >= 1999)
#Load the mapping from CMP to committee jurisdictions.
CMP.codes <- read_excel('data/Formatted Gov Data.xlsx', sheet = 'CMP')
CMP <- lapply(lapply(str_split(CMP.codes$CMP_code, pattern= ';'), str_trim), FUN=function(i){
  i <- i[i != '']
  o <- CMP[, i]
  o <- data.frame('sum.influence' = rowSums(o))
  o <- cbind(o, CMP[, c('partyname', 'partyabbrev', 'year', 'edate')])
  return(o)
})
CMP <- lapply(1:13, FUN=function(i){
  o <- CMP[[i]]
  o$committee_name <- CMP.codes$committee_name[i]
  return(data.frame(o))
})

CMP <- do.call('rbind', CMP)
CMP <- data.frame(CMP)
dts <- sort(unique(CMP$edate))
CMP$legislature <- (8:12)[match(CMP$edate, dts)]
#Inside of each party, rank the portfolios from least to most salient.
CMP <- ddply(CMP, ~partyname + partyabbrev + legislature, .fun=function(i){
  i$rank <- 13-rank(i$sum.influence)
  i$adjust.rank <- seq(-1, 1, length.out = 13)[rank(i$sum.influence)]
  return(i)
})

class.lookup <- readRDS('data/svm_codes.RDS')

MP.metadata <- read.csv('data/MP_metadata.csv', stringsAsFactors = F, encoding = 'UTF-8')

#Load the classifications from the Python data.
classifed.speeches <- read.csv('data/classifed_speeches.csv', stringsAsFactors = F, encoding = 'UTF-8')
classifed.speeches <- classifed.speeches[, c('mp_id', 'legislature', 'date', 'length_words', 'class_name')]
classifed.speeches$speech.class <- class.lookup$svm_code[match(classifed.speeches$class_name, class.lookup$short_name)]

dta <- merge(classifed.speeches, MP.metadata, all = T, by.y= c('mp_id', 'legislature', 'speech.date'), by.x = c('mp_id', 'legislature', 'date'))
dta$speech.date <- dta$date

#One MP changes party; code him as a new ID for our purposes.
dta$mp_id[which(dta$mp_id == 1740 & dta$legislature == 12)] <- 991740
dta$fmt.date <- dmy(dta$speech.date)

#Load the ministries (portfolio allocation)
ministries <- read_excel('data/Formatted Gov Data.xlsx', sheet ='Posts')
#Get the ranges of government dates.
ranges <- unique(ministries[c('start_date', 'end_date', 'government')])
#Get a mapping from dates to governments.
which.gov <- sapply(1:nrow(ranges), FUN=function(i){dta$fmt.date %within% interval(ranges$start_date[i], ranges$end_date[i])})
#Get the cabinet names.
dta$cabinet_name <- ranges$government[apply(which.gov, MARGIN = 1, FUN=function(i){which(i)})]
#Merge in the auxiliary coalition data.
dta <- ddply(dta, ~cabinet_name, .fun=function(k){
  assignment <- subset(ministries, government == k$cabinet_name[1])
  aux.data <- assignment[match(k$speech.class, assignment$svm_code), c('short_name', 'coalition', 'majority', 'party')]
  names(aux.data) <- c('short_name', 'gov_coalition', 'gov_majority', 'minister_party')
  k <- cbind(k, aux.data)
  return(k)
})
#Get some data by government.
gov.data <- unique(dta[c('party', 'cabinet_name', 'government', 'legislature', 'party.size..n.mps.')])
names(gov.data)[5] <- 'n_mps'
gov.data <- arrange(gov.data, legislature, cabinet_name, party)
governing.parties <- subset(gov.data, government == 1)
governing.parties <- ddply(governing.parties, ~legislature + cabinet_name , .fun=function(i){
  coalition <- as.numeric(nrow(i) > 1)
  if (coalition == 0){
    gov.major <- i$party
    gov.minor <- NA
  }else{
    gov.major <- i$party[which.max(i$n_mps)]
    gov.minor <- i$party[which.min(i$n_mps)]
  }
  minority <- as.numeric(sum(i$n_mps) < 230/2)
  return(data.frame(coalition, gov.major, gov.minor, minority))
})
#party	party  ID. 0=BE; 1= CDS-PP; 2 - PEV; 3 - PCP; 4 - PS; 5 - PSD
dta$party_name <- c('BE', 'CDS', 'PEV', 'PCP', 'PS', 'PSD')[dta$party + 1]
dta$same.minister <- (dta$party_name == dta$minister_party)

#Get a matrix for the committee *assignments* of each MP.
MP.data <- dta[c('mp_id','legislature', 'cabinet_name', grep(names(dta), pattern='^com_[0-9]+$', value=T))] %>%
  group_by(mp_id, legislature, cabinet_name) %>% summarize_all(funs(as.numeric(mean(.) > 0)))
#Get the # of speeches.
speech.counts <- dcast(dta, mp_id + cabinet_name + legislature ~ speech.class , value.var = 'speech.class', fun.aggregate = length)
names(speech.counts)[-1:-3] <- paste('speech_com_', 0:12, sep='')

merged.data <- merge(MP.data, speech.counts, by = c('mp_id', 'legislature', 'cabinet_name'), all = T)
#Transform into LONG data with one row
#per MP, committee and then the # of speeches and the proportion.
long.data <- alply(merged.data, .margin = 1, .fun=function(i){
  speech.vector <- i[paste('speech_com', 0:12, sep='_')]
  committee.vector <- i[paste('com', 0:12, sep='_')]

  df <- data.frame(cbind(t(speech.vector), t(committee.vector)))
  rownames(df) <- NULL
  names(df) <- c('speech_count', 'assigned_committee')
  df$committee.id <- 0:12
  df$mp_id <- i$mp_id
  df$total_speeches <- sum(df$speech_count)
  df$percent_speeches <- df$speech_count/df$total_speeches
  df$legislature <- i$legislature
  df$cabinet_name <- i$cabinet_name
  return(df)
})
long.data <- do.call('rbind', long.data)
long.data$any_speech <- as.numeric(long.data$percent_speeches > 0)
#Collapse the relevant covariates.
relevant.covariates <- dta %>% group_by(mp_id, legislature, cabinet_name, party_name, party) %>% summarise(government = mean(government), seniority = mean(seniority), marginality = mean(electoral.vulnerability), party.size = mean(party.size..0.1.))
#Merge in the relevant covariates
long.data <- merge(long.data, relevant.covariates, by.x = c('mp_id', 'legislature', 'cabinet_name'), by.y = c('mp_id', 'legislature', 'cabinet_name'), all = T)

#Get the proportion of words allocated to each jurisdiction.
word.counts <- dcast(dta, mp_id + cabinet_name + legislature ~ speech.class , value.var = 'length_words', fun.aggregate = sum)
names(word.counts)[-1:-3] <- paste('speech_com_', 0:12, sep='')

merged.data.word <- merge(MP.data, word.counts, by = c('mp_id', 'legislature', 'cabinet_name'), all = T)
#Transform into LONG data with one row
#per MP, committee and then the # of speeches and the proportion.
long.data.word <- alply(merged.data.word, .margin = 1, .fun=function(i){
  speech.vector <- i[paste('speech_com', 0:12, sep='_')]
  committee.vector <- i[paste('com', 0:12, sep='_')]

  df <- data.frame(cbind(t(speech.vector), t(committee.vector)))
  rownames(df) <- NULL
  names(df) <- c('word_count', 'assigned_committee')
  df$committee.id <- 0:12
  df$mp_id <- i$mp_id
  df$total_speeches <- sum(df$word_count)
  df$percent_words <- df$word_count/df$total_speeches
  df$legislature <- i$legislature
  df$cabinet_name <- i$cabinet_name
  return(df)
})
long.data.word <- do.call('rbind', long.data.word)
long.data.word$any_speech <- as.numeric(long.data.word$percent_words > 0)

long.data.word <- merge(long.data.word, relevant.covariates, by.x = c('mp_id', 'legislature', 'cabinet_name'), by.y = c('mp_id', 'legislature', 'cabinet_name'), all = T)
#Should be a perfect match.
print(table(long.data.word$any_speech, long.data$any_speech))
long.data$percent_words <- long.data.word$percent_words

#Note the only non-coalition government.
long.data$coalition[long.data$cabinet_name == 'Guterres 2'] <- 0
#Merge in various information.
long.data$minister_party <- ministries$party[match(paste(long.data$cabinet_name, long.data$committee.id, sep = '@'), paste(ministries$government, ministries$svm_code, sep = '@'))]
long.data$same.minister <- as.numeric(long.data$party_name == long.data$minister_party)
long.data$gov.major <- ministries$party[match(paste(long.data$cabinet_name, long.data$committee.id, sep = '@'), paste(ministries$government, ministries$svm_code, sep = '@'))]
long.data$coalition <- ministries$coalition[match(paste(long.data$cabinet_name, long.data$committee.id, sep = '@'), paste(ministries$government, ministries$svm_code, sep = '@'))]
long.data$major.party <- as.numeric(long.data$party.size >= 0.30)
long.data$gov.senior <- long.data$major.party * long.data$government
long.data$gov.junior <- !long.data$major.party * long.data$government

long.data <- merge(long.data, unique(ministries[c('committee_name_eng' ,'svm_code')]), by.x = 'committee.id', by.y = 'svm_code', all.x = TRUE)
cab.order <- c('Guterres 2', 'Barroso', 'Santana Lopes', 'Socrates 1',
               'Socrates 2', 'Passos Coelho Ia', 'Passos Coelho Ib')
long.data$cabinet_name <- factor(long.data$cabinet_name, levels = cab.order, ordered = TRUE)

MP.position.data <- ddply(dta, ~mp_id + cabinet_name, .progress = 'text', .fun=function(i){
  com.roles <- apply(i[, paste('com', 0:12, sep='_')], MARGIN = 2, max)
  party.roles <- apply(i[, paste('party', 0:12, sep='_')], MARGIN = 2, max)
  com.rank <- apply(i[, paste('pos', 0:12, sep='_')], MARGIN = 2, max)
  df <- data.frame(com.roles, party.roles, com.rank, committee.id = 0:12)
  rownames(df) <- NULL
  return(df)
})

long.data <- merge(long.data, MP.position.data, by = c('mp_id', 'cabinet_name', 'committee.id'), all.x = TRUE)
long.data$party.coordinator <- long.data$party.roles == 1
long.data$formal.rank <- long.data$com.rank > 0

long.data$fmt_committee <- long.data$assigned_committee + long.data$party.coordinator
long.data$fmt_committee <- c('Backbencher', 'Committee Member', 'Party Coordinator')[long.data$fmt_committee + 1]
long.data$member <- c('Backbencher', 'Member')[2 - (long.data$fmt_committee == 'Backbencher')]

long.data$diff.minister <- 1 - long.data$same.minister

#Calculate senority data.

gov.starts <- dta %>% group_by(cabinet_name) %>% summarise(start.date = min(fmt.date))
seniority.data <- ddply(dta, ~mp_id, .fun=function(i){
  output <- i %>% group_by(cabinet_name) %>% summarize(first.seniority = min(seniority), election.date = median(i$fmt.date - days(round(i$seniority * 365, digits  = 0))))
})

seniority.data$gov.start.date <- gov.starts$start.date[match(seniority.data$cabinet_name, gov.starts$cabinet_name)]
seniority.data$exact.time <- as.numeric(difftime(seniority.data$gov.start.date, seniority.data$election.date, units = 'days')/365)


long.data <- merge(long.data, seniority.data, by = c('mp_id', 'cabinet_name'), all.x = TRUE)
long.data$seniority <- long.data$first.seniority

#Merge in manifesto data.
CMP$partyabbrev[CMP$partyabbrev == 'CDS-PP'] <- 'CDS'
long.data <- merge(long.data, CMP, by.x = c('committee_name_eng', 'party_name', 'legislature'),
                   by.y = c('committee_name', 'partyabbrev', 'legislature'), all.x = TRUE)

rm(which.gov)

##Counts per Committee##
long.data$legis.year <- c('1999', '2002', '2005', '2009', '2011')[match(long.data$legislature, 8:12)]
long.data$fmt_com_name <- factor(long.data$committee_name_eng, levels = rev(sort(unique(long.data$committee_name_eng))), ordered = TRUE)
pdf('figures/descriptive_count_of_speeches.pdf')
ggplot(long.data %>% group_by(legis.year, fmt_com_name, committee.id) %>% summarise(total = sum(speech_count))) + geom_bar(aes(x=fmt_com_name, y=total), stat = 'identity') + facet_wrap(~legis.year, nrow = 1) + coord_flip() +
  ylab('Count of Speeches') + xlab('Committee') + theme_bw() + theme(axis.text.x=element_text(angle=90, hjust=1))
dev.off()

pdf('figures/descriptive_who_spoke.pdf')
ggplot(long.data %>% group_by(legis.year, fmt_com_name, committee.id) %>% summarise(total = mean(any_speech))) + geom_bar(aes(x=fmt_com_name, y=total), stat = 'identity') + facet_wrap(~legis.year, nrow = 1) + coord_flip() +
  ylab('Proportion of MPs Who Spoke') + xlab('Committee') + theme_bw()
dev.off()

FE.analysis <- long.data %>% group_by(mp_id, committee_name_eng) %>% summarize(variation.in.assignment = var(assigned_committee))
FE.analysis$any.variation.in.assignment <- FE.analysis$variation.in.assignment > 0
FE.analysis$any.variation.in.assignment[is.na(FE.analysis$any.variation.in.assignment)] <- -1

with(FE.analysis, table(any.variation.in.assignment))



with(subset(long.data, !is.na(adjust.rank)), length(mp_id))
with(subset(long.data, !is.na(adjust.rank)), length(unique(mp_id)))
with(subset(long.data), length(mp_id))
with(subset(long.data), length(unique(mp_id)))


###Loops to Run the Regressions.
assign.coef <- data.frame()

reg.lm.1 <- ~ factor(member) + major.party + log(seniority) + log(marginality) + factor(legislature)
reg.lm.2 <- ~ factor(member) * adjust.rank + major.party + log(seniority) + log(marginality) +  factor(legislature)
reg.lm.3 <- ~ factor(member) * government +  major.party +  log(seniority) + log(marginality) + factor(legislature)
reg.lm.4 <- ~ factor(member) * major.party +  log(seniority) + log(marginality) + factor(legislature)
reg.lm.5 <- ~ factor(member) * (major.party + government + adjust.rank) + log(seniority) + log(marginality) + factor(legislature)

long.data$robust_id <- paste(long.data$mp_id, long.data$committee_name_eng, sep = ' @ ')
write.dta(long.data, 'data/stata_replication.dta')

for (reg.number in 1:5){
  message(reg.number)
  use.data <- long.data
  use.data$quad <- paste(use.data$government, use.data$major.party, sep = '-')
  use.data$robust_id <- paste(use.data$mp_id, use.data$committee_name_eng, sep = ' @ ')
  use.data <- arrange(use.data, robust_id)
  plm.extensive <- plm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'any_speech ~ . '), data = use.data, effect = 'individual', index = 'robust_id')
  for (rtype in 1:4){
    if (rtype == 1){#Legis FE
      obj.percent <- lm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'percent_speeches ~ .'), use.data)
      obj.extensive <- lm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'any_speech ~ .'), use.data)
    }else if (rtype == 2){#Legis/Committee FE
      obj.percent <- lm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'percent_speeches ~ . + factor(committee_name_eng)'), use.data)
      obj.extensive <- lm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'any_speech ~ . + factor(committee_name_eng)'), use.data)
    }else if (rtype == 3){#MP/Legis/Committee FE
      obj.extensive <- plm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'any_speech ~ . + factor(committee_name_eng)'), data = use.data, effect = 'individual', index = 'mp_id')
      obj.percent <- plm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'percent_speeches ~ . + factor(committee_name_eng)'), data = use.data, effect = 'individual', index = 'mp_id')
    }else if (rtype == 4){#Main Result: Legis/MP-Committee FE
      obj.extensive <- plm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'any_speech ~ . + factor(committee_name_eng)'), data = use.data, effect = 'individual', index = 'robust_id')
      obj.percent <- plm(update.formula(get(paste('reg.lm', reg.number, sep = '.')), 'percent_speeches ~ . + factor(committee_name_eng)'), data = use.data, effect = 'individual', index = 'robust_id')
    }

    if (class(obj.extensive)[1] == 'lm'){
      coef.extensive <- unclass(coeftest(obj.extensive, cluster.vcov(obj.extensive, subset(use.data)$mp_id)))
    }else{
      coef.extensive <- unclass(coeftest(obj.extensive, vcovHC(obj.extensive, cluster = 'group')))
    }
    if (class(obj.percent)[1] == 'lm'){
      coef.percent <- unclass(coeftest(obj.percent, cluster.vcov(obj.percent, subset(use.data)$mp_id)))
    }else{
      coef.percent <- unclass(coeftest(obj.percent, vcovHC(obj.percent, cluster = 'group')))
    }
    if (reg.number == 2 & rtype %in% c(3,4)){
      message('Salience', rtype)
      salience.per.coef <- coeftest(obj.percent)[grepl(rownames(coeftest(obj.percent)), pattern='adjust.rank'), ]
      salience.ext.coef <- coeftest(obj.extensive)[grepl(rownames(coeftest(obj.extensive)), pattern='adjust.rank'), ]

      salience.per.vcov <- vcovHC(obj.percent, cluster = 'group')
      salience.per.vcov <- salience.per.vcov[grepl(rownames(salience.per.vcov), pattern='adjust.rank'),grepl(rownames(salience.per.vcov), pattern='adjust.rank')]

      salience.ext.vcov <- vcovHC(obj.extensive, cluster = 'group')
      salience.ext.vcov <- salience.ext.vcov[grepl(rownames(salience.ext.vcov), pattern='adjust.rank'),grepl(rownames(salience.ext.vcov), pattern='adjust.rank')]
      #Var(b0 + b1) = Var(b0) + Var(b1) + 2Cov(b0,b1)
      var.inter.ext <- sum(salience.ext.vcov[1,] * c(1,2)) + salience.ext.vcov[2,2]
      coef.inter.ext <- sum(salience.ext.coef[,1])
      t.inter.ext <- coef.inter.ext/sqrt(var.inter.ext)
      coef.inter.per <- sum(salience.per.coef[,1])
      var.inter.per <- sum(salience.per.vcov[1,] * c(1,2)) + salience.per.vcov[2,2]
      t.inter.per <- sum(salience.per.coef[,1])/sqrt(var.inter.per)
      print(data.frame('coef.ext.inter' = coef.inter.ext, 'coef.per.inter' = coef.inter.per))
      print(linearHypothesis(obj.extensive, 'adjust.rank + factor(member)Member:adjust.rank = 0', vcov = vcovHC(obj.extensive, cluster = 'group'), test = 'F'))
      print(linearHypothesis(obj.percent, 'adjust.rank + factor(member)Member:adjust.rank = 0', vcov = vcovHC(obj.percent, cluster = 'group'), test = 'F'))

    }
    if (reg.number %in% c(3,5) & rtype %in% c(3,4)){
      message('GOV INTERTEST', rtype)
      gov.per.coef <- coeftest(obj.percent)[grepl(rownames(coeftest(obj.percent)), pattern='govern'), ]
      gov.ext.coef <- coeftest(obj.extensive)[grepl(rownames(coeftest(obj.extensive)), pattern='govern'), ]

      gov.per.vcov <- vcovHC(obj.percent, cluster = 'group')
      gov.per.vcov <- gov.per.vcov[grepl(rownames(gov.per.vcov), pattern='govern'),grepl(rownames(gov.per.vcov), pattern='govern')]

      gov.ext.vcov <- vcovHC(obj.extensive, cluster = 'group')
      gov.ext.vcov <- gov.ext.vcov[grepl(rownames(gov.ext.vcov), pattern='govern'),grepl(rownames(gov.ext.vcov), pattern='govern')]
      #Var(b0 + b1) = Var(b0) + Var(b1) + 2Cov(b0,b1)
      var.inter.ext <- sum(gov.ext.vcov[1,] * c(1,2)) + gov.ext.vcov[2,2]
      t.inter.ext <- sum(gov.ext.coef[,1])/sqrt(var.inter.ext)
      var.inter.per <- sum(gov.per.vcov[1,] * c(1,2)) + gov.per.vcov[2,2]
      t.inter.per <- sum(gov.per.coef[,1])/sqrt(var.inter.per)
      print(data.frame('ext.inter' = t.inter.ext, 'per.inter' = t.inter.per))
      print(linearHypothesis(obj.extensive, 'government + factor(member)Member:government = 0', vcov = vcovHC(obj.extensive, cluster = 'group'), test = 'F'))
      print(linearHypothesis(obj.percent, 'government + factor(member)Member:government = 0', vcov = vcovHC(obj.percent, cluster = 'group'), test = 'F'))
    }
    coef.extensive <- data.frame(coef.extensive)
    coef.percent <- data.frame(coef.percent)
    coef.extensive$var <- rownames(coef.extensive)
    coef.percent$var <- rownames(coef.percent)
    coef.percent$type <- 'Percent'
    coef.extensive$type <- 'Extensive'
    bind <- rbind(coef.percent, coef.extensive) #coef.intensive
    names(bind) <- c('est', 'se', 't', 'pr', 'var', 'type')
    bind$reg.number <- reg.number
    bind$reg.model <- rtype
    assign.coef <- rbind(assign.coef, bind)
  }
}
rownames(assign.coef) <- NULL

old.variables <- c('(Intercept)', 'factor(member)Member',
                   'adjust.rank','factor(member)Member:adjust.rank',
                   'government', 'factor(member)Member:government',
                   'major.party', 'factor(member)Member:major.party',
   'log(seniority)', 'log(marginality)')

clean.variables <- c(
  'Intercept', 'Member',
  'Salience','Member X Salience',
  'Government', 'Member X Government',
  'Major Party','Member X Major Party',
  'Seniority (Log)',
  'Marginality (Log)')

assign.coef$fmt.name <- clean.variables[match(assign.coef$var, old.variables)]
assign.coef$fmt.name <- factor(assign.coef$fmt.name, levels = clean.variables, ordered = TRUE)

format.regtable <- function(dta, cast.fmla, CI){
  dta <- subset(dta, !grepl(var, pattern='seniority|marginality|legislature|committee_name|Intercept'))
  dta <- subset(dta, !is.na(fmt.name))

  cast.coef <- dcast(dta, cast.fmla, value.var = 'est')

  norder <- cast.coef$fmt.name

  cast.coef <- cast.coef[,-1, drop = F]

  dta$ci <- paste('[', sprintf('%.3f',round(dta$est - 1.96 * dta$se,3)), ', ', sprintf('%.3f',round(dta$est + 1.96 * dta$se, 3)), ']', sep = '')
  cast.CI <- dcast(dta, cast.fmla, value.var ='ci')[,-1, drop = F]
  cast.se <- dcast(dta, cast.fmla, value.var = 'se')[,-1, drop = F]
  cast.p <- dcast(dta, cast.fmla, value.var = 'pr')[,-1, drop = F]

  cast.coef <- round(cast.coef, digits = 3)
  cast.se <- round(cast.se, digits = 3)

  cast.coef[is.na(cast.coef)] <- ''
  cast.se[is.na(cast.se)] <- ''
  cast.CI[is.na(cast.CI)] <- ''

  cast.coef[cast.coef != ''] <- sprintf('%.3f', as.numeric(cast.coef[cast.coef != '']))

  cast.se[cast.se != ''] <- sprintf('%.3f', as.numeric(cast.se[cast.se != '']))
  cast.se[cast.se != ''] <- paste('(', cast.se[cast.se != ''], ')', sep = '')

  if (!CI){
    cast.coef[cast.p < 0.01 & !is.na(cast.p)] <- paste(cast.coef[cast.p < 0.01 & !is.na(cast.p)], '$^{***}$', sep='')
    cast.coef[cast.p >= 0.01 & cast.p < 0.05 & !is.na(cast.p)] <- paste(cast.coef[cast.p >= 0.01 & cast.p < 0.05 & !is.na(cast.p)], '$^{**}$', sep='')
    cast.coef[cast.p >= 0.05 & cast.p < 0.1 & !is.na(cast.p)] <- paste(cast.coef[cast.p >= 0.05 & cast.p < 0.1 & !is.na(cast.p)], '$^{*}$', sep='')

    cast.output <- do.call('rbind', lapply(1:length(norder), FUN=function(i){
      rbind(cast.coef[i,], cast.se[i,])
    }))
  }else{

    cast.output <- do.call('rbind', lapply(1:length(norder), FUN=function(i){
      rbind(cast.coef[i,], cast.CI[i,])
    }))
  }

  cast.output <- cbind(c(rbind(as.character(norder), rep('', length(norder)))), cast.output)

}

create.regression.robustness <-  ddply(assign.coef, ~reg.number + type, .fun=function(dta){
  format.regtable(dta, cast.fmla = fmt.name ~ reg.model, CI = F)
})
names(create.regression.robustness) <- c('regression.type', 'dv', 'variable', '(1)','(2)', '(3)', '(4)')

create.regression.main <-  ddply(assign.coef, ~ reg.model + type, .fun=function(dta){
  format.regtable(dta, cast.fmla = fmt.name ~ reg.number, CI = F)
})
names(create.regression.main) <- c('regression.model', 'dv', 'variable', '(1)','(2)', '(3)', '(4)', '(5)')

#Do [,-1:-2] to show all one-at-a-time models.
writeLines(paste(apply(subset(create.regression.main, regression.model == 4 & dv == 'Percent')[,c(3,4,8)], MARGIN = 1, FUN=function(i){paste(i, collapse = ' & ')}), '\\\\'),
           'figures/tab_percent_model_4.tex')
writeLines(paste(apply(subset(create.regression.main, regression.model == 4 & dv == 'Extensive')[,c(3,4,8)], MARGIN = 1, FUN=function(i){paste(i, collapse = ' & ')}), '\\\\'),
           'figures/tab_extensive_model_4.tex')
writeLines(paste(apply(subset(create.regression.main, regression.model == 3 & dv == 'Percent')[,c(3,4,8)], MARGIN = 1, FUN=function(i){paste(i, collapse = ' & ')}), '\\\\'),
           'figures/tab_percent_model_3.tex')
writeLines(paste(apply(subset(create.regression.main, regression.model == 3 & dv == 'Extensive')[,c(3,4,8)], MARGIN = 1, FUN=function(i){paste(i, collapse = ' & ')}), '\\\\'),
           'figures/tab_extensive_model_3.tex')

#Robustness Results for the Supplementary Information:
library(survival)
#Fixed Effect Logistic Regression:
clogit.any <- clogit(any_speech ~ factor(member) * (adjust.rank + government + major.party) +
                       log(seniority) + log(marginality) + factor(legislature) +
                       strata(robust_id), data = long.data)
summary(clogit.any)
clogit.any <- data.frame(unclass(coeftest(clogit.any)))
names(clogit.any) <- c('est', 'se', 't', 'pr')
clogit.any$var <- rownames(clogit.any)
clogit.any <- subset(clogit.any, !is.na(est))
clogit.any$regression.model <- 1:nrow(clogit.any)
clogit.any$reg.type <- 1

clogit.any$fmt.name <- clean.variables[match(clogit.any$var, old.variables)]
clogit.any$fmt.name <- factor(clogit.any$fmt.name, levels = clean.variables, ordered = TRUE)
clogit.any <- format.regtable(dta = clogit.any, cast.fmla = fmt.name ~ reg.type, CI = TRUE)

clogit.any <- cbind(subset(create.regression.main, regression.model == 4 & dv == 'Extensive')[,c(3, ncol(create.regression.main))], clogit.any[,2])
names(clogit.any)[2:3] <- c('FE-OLS', 'FE-Logit')

writeLines(paste(apply(clogit.any, MARGIN = 1, FUN=function(i){paste(i, collapse = ' & ')}), '\\\\'),
           'figures/fe_logit.tex')

#Alterative DV:
word.plm <- plm(percent_words ~ factor(member) * (adjust.rank + government + major.party) +
                  log(seniority) + log(marginality) + factor(legislature), data = long.data, index = 'robust_id')
word.plm <- data.frame(unclass(coeftest(word.plm, vcovHC(word.plm))))
names(word.plm) <- c('est', 'se', 't', 'pr')
word.plm$var <- rownames(word.plm)
word.plm <- subset(word.plm, !is.na(est))
word.plm$regression.model <- 1:nrow(word.plm)
word.plm$reg.type <- 1

word.plm$fmt.name <- clean.variables[match(word.plm$var, old.variables)]
word.plm$fmt.name <- factor(word.plm$fmt.name, levels = clean.variables, ordered = TRUE)
word.plm <- format.regtable(dta = word.plm, cast.fmla = fmt.name ~ reg.type, CI = TRUE)

word.plm <- cbind(subset(create.regression.main, regression.model == 4 & dv == 'Percent')[,c(3, ncol(create.regression.main))], word.plm[,2])
names(word.plm)[2:3] <- c('Prop. Speeches', 'Prop. Words')

writeLines(paste(apply(word.plm, MARGIN = 1, FUN=function(i){paste(i, collapse = ' & ')}), '\\\\'),
           'figures/words_robustness.tex')

ggplot() + geom_point(aes(x=long.data$percent_words, y=long.data$percent_speeches))
print(cor(x=long.data$percent_words, y=long.data$percent_speeches))

#PreTrend Pictures:

dta$month <- month(dta$fmt.date)
dta$year <- year(dta$fmt.date)

FE.time <- dta[c('mp_id','month', 'year', grep(names(dta), pattern='^com_[0-9]+$', value=T))] %>%
  group_by(mp_id, month, year) %>% summarize_all(funs(as.numeric(mean(.) > 0)))

FE.count <- dcast(dta, mp_id + month + year ~ speech.class, value.var = 'speech.class', fun.aggregate = length)

names(FE.count)[-1:-3] <- paste('speech_com_', 0:12, sep='')

FE.merged <- merge(FE.time, FE.count, by = c('mp_id', 'month', 'year'), all = T)

FE.merged <- alply(FE.merged, .margin = 1, .fun=function(i){
  speech.vector <- i[paste('speech_com', 0:12, sep='_')]
  committee.vector <- i[paste('com', 0:12, sep='_')]

  df <- data.frame(cbind(t(speech.vector), t(committee.vector)))
  rownames(df) <- NULL
  names(df) <- c('speech_count', 'assigned_committee')
  df$committee.id <- 0:12
  df$mp_id <- i$mp_id
  df$total_speeches <- sum(df$speech_count)
  df$percent_speeches <- df$speech_count/df$total_speeches
  df$month <- i$month
  df$year <- i$year
  return(df)
})
FE.merged <- do.call('rbind', FE.merged)
FE.merged$any_speech <- as.numeric(FE.merged$percent_speeches > 0)
FE.merged$pseudo.date <- with(FE.merged, ymd(paste(year, month, 1, sep = '-')))
FE.merged <- arrange(FE.merged, pseudo.date)

pretrend <- ddply(FE.merged, ~ mp_id + committee.id, .progress = 'text', .inform=T, .fun=function(d){
  if (length(unique(d$assigned_committee)) > 1 & d$assigned_committee[1] == 0){
    which.on <- which(d$assigned_committee == 1)
    d <- d[1:max(which.on),]
    breaks <- c(0, which(diff(d$assigned_committee) == -1), nrow(d))
    output <- data.frame()
    for (b in 1:(length(breaks)-1)){
      sub.d <- d[(breaks[b]+1):breaks[b+1],]
      sub.d$time.diff <- as.vector(sub.d$pseudo.date - sub.d$pseudo.date[min(which(sub.d$assigned_committee == 1))])
      sub.d$mean.per <- mean(sub.d$percent_speeches)
      sub.d$mean.any <- mean(sub.d$any_speech)
      output <- rbind(output, sub.d)
    }
    return(output)
  }else{
    return(data.frame())
  }
})
pretrend$floor <- floor(pretrend$time.diff/30)
pretrend$fmt.membership <- c('No', 'Yes')[pretrend$assigned_committee + 1]



pdf('figures/pretend_any.pdf')
ggplot(subset(pretrend, floor %in% seq(-24, 24)) %>% group_by(floor, fmt.membership) %>% summarize(n = n(), value = mean(any_speech - mean.any))) +
  geom_point(aes(x=floor, y=value, col = factor(fmt.membership))) +
  geom_point(aes(x=floor, y=value, size = n, col = factor(fmt.membership))) +
  geom_vline(aes(xintercept=0)) + xlab('Time to Membership') + ylab('Deviation from Baseline') +
  labs(col = 'Committee\nMember') + scale_size_continuous(guide = FALSE) + theme_bw()
dev.off()

pdf('figures/pretrend_percent.pdf')
ggplot(subset(pretrend, floor %in% seq(-24, 24)) %>% group_by(floor, fmt.membership) %>% summarize(n = n(), value = mean(percent_speeches - mean.per))) +
  geom_point(aes(x=floor, y=value, size = n, col = factor(fmt.membership))) +
  geom_vline(aes(xintercept=0)) + xlab('Time to Membership') + ylab('Deviation from Baseline') +
  labs(col = 'Committee\nMember') + scale_size_continuous(guide = FALSE) + theme_bw()
dev.off()

head(pretrend)

party.activity <- long.data %>% group_by(legislature, party_name, committee_name_eng, adjust.rank) %>% summarise(party_speeches = sum(speech_count))
party.activity <- merge(party.activity, long.data %>% group_by(legislature, party_name) %>% summarise(all.count = sum(speech_count)), by = c('party_name', 'legislature'))

ggplot(party.activity)+ geom_point(aes(x=adjust.rank, y=party_speeches/all.count)) + geom_smooth(aes(x=adjust.rank, y=party_speeches/all.count)) + facet_wrap(~party_name)

cor.test(party.activity$adjust.rank, party.activity$party_speeches/party.activity$all.count, use = 'pairwise.complete')


word.plm <- plm(percent_speeches ~ factor(member) * (adjust.rank + government + major.party) +
                  log(seniority) + log(marginality) + factor(legislature), data = long.data, index = 'robust_id')
coeftest(word.plm, vcovHC(word.plm))
